/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
/**
 * @author Nikolay A. Sidelnikov
 */

#include "Ia32IRManager.h"
#include "Ia32Inst.h"
#include "Dominator.h"
#include "Ia32CgUtils.h"

namespace Jitrino {
namespace Ia32 {

/**
 * Replaces operations with 64 bits integer values with a set of operations
 * supported by IA-32 ISA.
 *
 * I8PseudoInstruction-s, normally generated by InstCodeSelector.
 */
class I8Lowerer : 
    public SessionAction, 
    protected OpndUtils, 
    protected SubCfgBuilderUtils 
{
    /**
     * Main entry point of this Action.
     */
    void runImpl(void);
private:
    typedef StlMap<Opnd *,Opnd **>  OPND_PAIRS_MAP;
    typedef StlVector<Inst*>        INST_ARRAY;
    typedef StlMap<Inst*, Node*>    INST_TO_NODE_MAP;
    //
    // virtuals
    //
    U_32 getNeedInfo(void) const
    {
        return NeedInfo_LoopInfo;
    }
    U_32 getSideEffects(void) const
    {
        // Simplest presumption - if we found at least one I8PseudoInst
        // we might affect everything, including liveness, loop info 
        // and dominator tree
        return foundI8Opnds;
    }
protected:
    /**
     * Dispatches instruction processing, basing on its Mnemonic.
     */
    void processOpnds(Inst * inst);
    /**
     * Splits the long operand up to the 2 32bits operands.
     */
    void prepareNewOpnds(Opnd * longOpnd, Opnd*& lowPartOpnd, Opnd*& hiPartOpnd);

    void buildShiftSubGraph(Inst * inst, Opnd * src1_1, Opnd * src1_2, Opnd * src2, Opnd * dst_1, Opnd * dst_2, Mnemonic mnem, Mnemonic opMnem);
    void buildComplexSubGraph(Inst * inst, Opnd * src1_1,Opnd * src1_2,Opnd * src2_1,Opnd * src2_2, Inst * condInst = NULL);
    void buildSetSubGraph(Inst * inst, Opnd * src1_1,Opnd * src1_2,Opnd * src2_1,Opnd * src2_2, Inst * condInst = NULL);
    void compareAndBranch(Inst * inst, Opnd * src1_1,Opnd * src1_2,Opnd * src2_1,Opnd * src2_2, Inst * condInst = NULL);
    /**
     * Processes I8 IMUL instruction.
     *
     * Depending on a flag, generates either call to internal helper, 
     * or generates and inserts sub-graph.
     *
     * @see inlineMul64
     */
    void lowerMul64(Inst* inst);
    /**
     * Processes I8 IDIV instruction.
     *
     * Depending on a flag, generates either call to internal helper, 
     * or generates and inserts sub-graph.
     */
    void lowerDiv64(Inst* inst);
    /**
     * Processes I8 remainder instruction.
     *
     * Depending on a flag, generates either call to internal helper, 
     * or generates and inserts sub-graph.
     */
    void lowerRem64(Inst* inst);
    /**
     * Generates sub-graph for multiplication of 2 64-bits long values.
     */
    void inlineMul64(Inst* inst);
    /**
     * Generates sub-graph for calculating quotient and remainder of 2 
     * 64 bits integer values, using integer instructions.
     */
    void inlineDivRem64(bool wantReminder, Inst* inst);
    /**
     * Tries to propagate and reuse results of inlined Div64 or Rem64 
     * instruction, to reduce code size and complex operations in the 
     * resulting CFG.
     */
    void propagateDivRemResults(Inst* originalInst, Node* originalInstNode, 
        Opnd* quot_lo, Opnd* quot_hi, Opnd* rem_lo, Opnd* rem_hi);

    U_32 foundI8Opnds;
private:
    /**
     * Tests whether the type is subject for lowering.
     */
    static bool isI8Type(Type * t)
    {
        return t->tag==Type::Int64 || t->tag==Type::UInt64;
    }
    /**
     * Tests whether the given Inst represents I8PseudoInstruction of 
     * remainder calculation.
     */
    static bool isI8RemInst(const Inst* inst) 
    {
        if (!inst->hasKind(Inst::Kind_I8PseudoInst)) {
            return false;
        }
        if (inst->getMnemonic() != Mnemonic_IDIV) {
            return false;
        }
        if (inst->getOpndCount()<=3) {
            return false;
        }
        // The additional fake parameter shows that in fact 
        // we need REMinder, not DIV. See InstCodeSelector
#ifdef _DEBUG
        Opnd* fakeFlag = inst->getOpnd(3);
        // Hard - coded values as they're hard coded in CodeSelector
        // Just to check no one started to use the I8PseudoInst with 
        // IDIV mnemonic for anything else.
        assert(isImm(fakeFlag) && fakeFlag->getImmValue()==12345678);
#endif
        return true;
    }
    /**
     * performs original mnemonic action on the new operands
     */
    void applyMnemonic(Inst* inst, Opnd* dst,  Opnd* dst_1,   Opnd* dst_2,
                                   Opnd* src1, Opnd* src1_1, Opnd* src1_2,
                                   Opnd* src2, Opnd* src2_1, Opnd* src2_2);
    /**
     * check if the opnd is a volatile field of long type
     */
    bool isLongVolatileMemoryOpnd(Opnd* opnd);

    /**
     * Points to the list of instructions to process.
     *
     * The pointer points to the local variable in runImpl();
     */
    INST_ARRAY*         m_pI8Insts;
    /**
     * Points to map of pairs 64 bits operand => two 32 bit operands.
     *
     * The pointer points to the local variable in runImpl();
     */
    OPND_PAIRS_MAP*     m_pairs;
    /**
     * Points to a map of Inst => header of the loop it belongs to.
     *
     * The pointer points to the local variable in runImpl();
     */
    INST_TO_NODE_MAP*   m_pLoopInfos;
};

static const char* help = 
"inline_mul64=true/false\n"
"   default=true. Inlines multiplication of 64 bits longs, instead of generating call to helper.\n"
"inline_mul64_checks=true/false\n"
"   default=false. Adds additional checks whether multipliers are indeed I_32 for simpler operation.\n"
"inline_div64=true/false\n"
"   default=true. Inlines division of 64 bits longs, instead of generating call to helper.\n"
"inline_rem64=true/false\n"
"   default=true. Inlines calculation of remainder of 64 bits longs, instead of generating call to helper.\n"
"propagate_div_rem=true/false\n"
"   default=true. Tries to match pairs 'A/B, A%B' and inline only single code sequence.\n"
"";

static ActionFactory <I8Lowerer> _i8l("i8l", help);

//_______________________________________________________________________________________________________________
// I8 operation internal helpers
int64 __stdcall imul64(const int64 src1, const int64 src2)  stdcall__;
int64 __stdcall imul64(const int64 src1, const int64 src2)
{   return src1*src2;   }

int64 __stdcall idiv64(const int64 src1, const int64 src2)  stdcall__;
int64 __stdcall idiv64(const int64 src1, const int64 src2)
{   return src1/src2;   }

int64 __stdcall irem64(const int64 src1, const int64 src2)  stdcall__;
int64 __stdcall irem64(const int64 src1, const int64 src2)
{   return src1%src2;   }


static void checkIR(IRManager* irm);

void I8Lowerer::runImpl() 
{
    // I8 operation internal helpers
    irManager->registerInternalHelperInfo("imul64", IRManager::InternalHelperInfo((void*)&imul64,&CallingConvention_STDCALL));
    irManager->registerInternalHelperInfo("idiv64", IRManager::InternalHelperInfo((void*)&idiv64,&CallingConvention_STDCALL));
    irManager->registerInternalHelperInfo("irem64", IRManager::InternalHelperInfo((void*)&irem64,&CallingConvention_STDCALL));
    
    // Initialize various xxUtils
    setIRManager(irManager);

    ControlFlowGraph* fg = irManager->getFlowGraph();

    MemoryManager memMgr("i8lowerer");

    StlMap<Opnd *,Opnd **> pairs(memMgr);
    m_pairs = &pairs;

    INST_ARRAY  i8Insts(memMgr);
    m_pI8Insts = &i8Insts;

    INST_TO_NODE_MAP loopInfos(memMgr);
    m_pLoopInfos = &loopInfos;

    irManager->calculateOpndStatistics();

    const Nodes* postOrder = &fg->getNodesPostOrder();
    const LoopTree* loopTree = fg->getLoopTree();

    //
    // 1. Walk the CFG, collect Inst-s to be processed. 
    // Also, count the defs [for I8] operands
    //
    for (Nodes::const_reverse_iterator it = postOrder->rbegin(), end = postOrder->rend(); it!=end; ++it) {
        Node* node = *it;
        if (!node->isBlockNode()) {
            continue;
        }
        for (Inst* inst = (Inst*)node->getFirstInst(); inst!=NULL; inst = inst->getNextInst()) {
            bool doProcess = 
                    inst->hasKind(Inst::Kind_I8PseudoInst) || 
                    inst->getMnemonic()==Mnemonic_CALL || 
                    inst->getMnemonic()==Mnemonic_RET ||
                    inst->hasKind(Inst::Kind_EntryPointPseudoInst) || 
                    inst->hasKind(Inst::Kind_AliasPseudoInst);
            if (doProcess) {
                i8Insts.push_back(inst);
                Node* loopHeader = loopTree->getLoopHeader(node, false);
                loopInfos[inst] = loopHeader;
            }
        }
    }

    // Will be used in propagateDivRem()
    computeDominators();

    for(StlVector<Inst *>::iterator it = i8Insts.begin(); it != i8Insts.end(); it++) {
        Inst* inst = *it;
        // Processes instructions get replaced with NULL
        if (inst != NULL) {
            processOpnds(inst);
            *it = NULL;
        }
    }

    fg->purgeEmptyNodes();
    fg->purgeUnreachableNodes();

    postOrder = &fg->getNodesPostOrder();
    for (Nodes::const_reverse_iterator it = postOrder->rbegin(), end = postOrder->rend(); it!=end; ++it) {
        Node * node= *it;
        if (!node->isBlockNode()) {
            continue;
        }
        Inst *  cdq = NULL;
        for (Inst* inst = (Inst*)node->getFirstInst(),*nextInst=NULL; inst!=NULL; inst = nextInst) {
            nextInst = inst->getNextInst();
            U_32  defCount = inst->getOpndCount(Inst::OpndRole_InstLevel|Inst::OpndRole_Def);
            U_32  useCount = inst->getOpndCount(Inst::OpndRole_InstLevel|Inst::OpndRole_Use);
            if(inst->getMnemonic() == Mnemonic_CDQ) {
                if (inst->getNextInst()!=NULL && inst->getNextInst()->getMnemonic() == Mnemonic_IDIV) {
                    continue;
                }
                cdq = inst;
            } else if ( cdq && inst->getMnemonic() == Mnemonic_AND &&
                isZeroImm(inst->getOpnd(defCount+1)) && 
                cdq->getOpnd(0)==inst->getOpnd(defCount)) {
                    Inst * tmpInst = irManager->newCopyPseudoInst(Mnemonic_MOV,inst->getOpnd(0), irManager->newImmOpnd(inst->getOpnd(0)->getType(),0));
                    tmpInst->insertAfter(inst);
                    inst->unlink();
                    inst = tmpInst;
                    cdq->unlink();
                    cdq = NULL;
            } else if ( inst->getMnemonic() == Mnemonic_AND &&
                        isImm(inst->getOpnd(defCount+1)) && 
                        inst->getOpnd(defCount+1)->getImmValue() == 0xFFFFFFFF) {
                Inst * tmpInst = irManager->newCopyPseudoInst(Mnemonic_MOV,inst->getOpnd(0), inst->getOpnd(defCount));
                tmpInst->insertAfter(inst);
                inst->unlink();
                inst = tmpInst;
            } else {
                if (cdq) {
                    Opnd* defCDQ = cdq->getOpnd(0);
                    for (U_32 i=defCount;i<defCount+useCount;i++) {
                        if (defCDQ == inst->getOpnd(i)) {
                            cdq = NULL;
                            break;
                        }
                    }
                }
            }        }
    }
    
    checkIR(irManager);
}

void I8Lowerer::processOpnds(Inst * inst)
{
    Opnd * newOp1 = NULL, *newOp2 = NULL;
    Opnd * dst_1 = NULL, * dst_2 = NULL, * src1_1 = NULL, * src1_2 = NULL, * src2_1 = NULL, * src2_2 = NULL;
    Mnemonic mn = inst->getMnemonic();
    if (mn==Mnemonic_CALL || 
        mn==Mnemonic_RET ||
        inst->hasKind(Inst::Kind_EntryPointPseudoInst) || 
        inst->hasKind(Inst::Kind_AliasPseudoInst)) {
        for(U_32 i = 0; i < inst->getOpndCount(); i++) {
            Opnd * opnd = inst->getOpnd(i);
            if (!isI8Type(opnd->getType())) {
                continue;
            }
            foundI8Opnds = ~(U_32)0;
            U_32 roles = inst->getOpndRoles(i);
            if (inst->hasKind(Inst::Kind_AliasPseudoInst)) {
                if (roles & Inst::OpndRole_Use) {
                    prepareNewOpnds(opnd,newOp1,newOp2);
                    inst->setOpnd(i, newOp1);
                    inst->insertOpnd(i+1, newOp2, roles);
                    inst->setConstraint(i, Constraint(OpndKind_Mem, OpndSize_32));
                    inst->setConstraint(i+1, Constraint(OpndKind_Mem, OpndSize_32));
                    i++;
                }
            } else {
                prepareNewOpnds(opnd,newOp1,newOp2);
                inst->setOpnd(i++, newOp1);
                inst->insertOpnd(i, newOp2, roles);
            }
        }
    } else if (inst->hasKind(Inst::Kind_I8PseudoInst)){
        U_32  defCount = inst->getOpndCount(Inst::OpndRole_InstLevel|Inst::OpndRole_Def),
                useCount = inst->getOpndCount(Inst::OpndRole_InstLevel|Inst::OpndRole_Use);
        Opnd * dst = defCount > 0 ? inst->getOpnd(0) : NULL;
        Opnd * src1 = useCount> 0 ? inst->getOpnd(defCount): NULL;
        Opnd * src2 = useCount> 1 ? inst->getOpnd(defCount+1): NULL;

        bool dstIsLongVolatile = false;

        if (mn!=Mnemonic_IDIV && mn!=Mnemonic_IMUL) {
            if (dst) {
                if(isLongVolatileMemoryOpnd(dst)) {
                    // temporary dst placed on stack will be used in the original instruction
                    dstIsLongVolatile = true;
                    Type* typeInt32 = irManager->getTypeManager().getInt32Type();
                    Type* typeUInt32 = irManager->getTypeManager().getUInt32Type();
                    dst_2 = irManager->newOpnd(typeInt32, Constraint(RegName_EDX));
                    dst_1 = irManager->newOpnd(typeUInt32, Constraint(RegName_EAX));
                } else {
                    prepareNewOpnds(dst,dst_1,dst_2);
                }
            }
            if (src1) {
                if(isLongVolatileMemoryOpnd(src1)) {
                    // prepare gp regs for CMPXCHG8B
                    Type* typeInt32 = irManager->getTypeManager().getInt32Type();
                    Type* typeUInt32 = irManager->getTypeManager().getUInt32Type();
                    Opnd* edx = irManager->newOpnd(typeInt32, Constraint(RegName_EDX));
                    Opnd* eax = irManager->newOpnd(typeUInt32, Constraint(RegName_EAX));
                    Opnd* ecx = irManager->newOpnd(typeInt32, Constraint(RegName_ECX));
                    Opnd* ebx = irManager->newOpnd(typeUInt32, Constraint(RegName_EBX));
                    Opnd* zero = irManager->newImmOpnd(typeInt32, 0);

                    irManager->newCopyPseudoInst(Mnemonic_MOV, edx, zero)->insertBefore(inst);
                    irManager->newCopyPseudoInst(Mnemonic_MOV, eax, zero)->insertBefore(inst);
                    irManager->newCopyPseudoInst(Mnemonic_MOV, ecx, edx)->insertBefore(inst);
                    irManager->newCopyPseudoInst(Mnemonic_MOV, ebx, eax)->insertBefore(inst);

                    // read src1 -> EDX:EAX regs using CMPXCHG8B inst
                    Inst* cmpxchg = irManager->newCMPXCHG8BPseudoInst(src1,edx,eax,ecx,ebx);
                    cmpxchg->setPrefix(InstPrefix_LOCK);
                    cmpxchg->insertBefore(inst);

                    // let src1_1:src1_2 be EAX:EDX 
                    src1_1 = eax;
                    src1_2 = edx;
                } else {
                    prepareNewOpnds(src1,src1_1,src1_2);
                }
            }
            if (src2) {
                if(isLongVolatileMemoryOpnd(src2)) {
                    // prepare gp regs for CMPXCHG8B
                    Type* typeInt32 = irManager->getTypeManager().getInt32Type();
                    Type* typeUInt32 = irManager->getTypeManager().getUInt32Type();
                    Opnd* edx = irManager->newOpnd(typeInt32, Constraint(RegName_EDX));
                    Opnd* eax = irManager->newOpnd(typeUInt32, Constraint(RegName_EAX));
                    Opnd* ecx = irManager->newOpnd(typeInt32, Constraint(RegName_ECX));
                    Opnd* ebx = irManager->newOpnd(typeUInt32, Constraint(RegName_EBX));
                    Opnd* zero = irManager->newImmOpnd(typeInt32, 0);

                    irManager->newCopyPseudoInst(Mnemonic_MOV, edx, zero)->insertBefore(inst);
                    irManager->newCopyPseudoInst(Mnemonic_MOV, eax, zero)->insertBefore(inst);
                    irManager->newCopyPseudoInst(Mnemonic_MOV, ecx, edx)->insertBefore(inst);
                    irManager->newCopyPseudoInst(Mnemonic_MOV, ebx, eax)->insertBefore(inst);

                    // read src2 -> EDX:EAX regs using CMPXCHG8B inst
                    Inst* cmpxchg = irManager->newCMPXCHG8BPseudoInst(src2,edx,eax,ecx,ebx);
                    cmpxchg->setPrefix(InstPrefix_LOCK);
                    cmpxchg->insertBefore(inst);
                    // let src2_1:src2_2 be EAX:EDX 
                    src2_1 = eax;
                    src2_2 = edx;
                } else {
                    prepareNewOpnds(src2,src2_1,src2_2);
                }
            }
        }

        applyMnemonic(inst,dst,dst_1,dst_2,src1,src1_1,src1_2,src2,src2_1,src2_2);

        if(dstIsLongVolatile) {
            // prepare gp regs for CMPXCHG8B
            Type* typeInt32 = irManager->getTypeManager().getInt32Type();
            Type* typeUInt32 = irManager->getTypeManager().getUInt32Type();
            Opnd* ecx = irManager->newOpnd(typeInt32, Constraint(RegName_ECX));
            Opnd* ebx = irManager->newOpnd(typeUInt32, Constraint(RegName_EBX));

            irManager->newCopyPseudoInst(Mnemonic_MOV, ecx, dst_2)->insertBefore(inst);
            irManager->newCopyPseudoInst(Mnemonic_MOV, ebx, dst_1)->insertBefore(inst);

            ControlFlowGraph * fg = irManager->getFlowGraph();
            Node* prevNode = inst->getNode();
            Node* nextNode = fg->splitNodeAtInstruction(inst, true, true, NULL);
            Node* loopNode = fg->createNode(Node::Kind_Block);
            Edge* prevOut = prevNode->getOutEdges().front();
            fg->replaceEdgeTarget(prevOut, loopNode);

            // write ECX:EBX -> dst using CMPXCHG8B inst
            Inst* cmpxchg = irManager->newCMPXCHG8BPseudoInst(dst,dst_2,dst_1,ecx,ebx);
            cmpxchg->setPrefix(InstPrefix_LOCK);
            loopNode->appendInst(cmpxchg);
            loopNode->appendInst(irManager->newBranchInst(Mnemonic_JNZ, loopNode, nextNode));

            fg->addEdge(loopNode, loopNode, 0.5);
            fg->addEdge(loopNode, nextNode, 0.5);
        }

        inst->unlink();
    }
}

bool I8Lowerer::isLongVolatileMemoryOpnd(Opnd* opnd) {

    if ( isI8Type(opnd->getType()) ) {
        Opnd* disp = opnd->isPlacedIn(OpndKind_Memory) ? opnd->getMemOpndSubOpnd(MemOpndSubOpndKind_Displacement) : NULL;
        Opnd::RuntimeInfo* ri = (disp == NULL) ? NULL : disp->getRuntimeInfo();
        Opnd::RuntimeInfo::Kind riKind = (ri == NULL) ? Opnd::RuntimeInfo::Kind_Null : ri->getKind();
        return (riKind == Opnd::RuntimeInfo::Kind_FieldOffset ||
                riKind == Opnd::RuntimeInfo::Kind_StaticFieldAddress) &&
               ((FieldDesc*)disp->getRuntimeInfo()->getValue(0))->isVolatile();
    } else {
        return false;
    }
}


void I8Lowerer::applyMnemonic(Inst* inst, Opnd* dst,  Opnd* dst_1,  Opnd* dst_2,
                                          Opnd* src1, Opnd* src1_1, Opnd* src1_2,
                                          Opnd* src2, Opnd* src2_1, Opnd* src2_2)
{
    Mnemonic mn = inst->getMnemonic();
    switch(mn) {
        case Mnemonic_ADD   :
            assert(dst_1 && src1_1 && src2_1);
            assert(dst_2 && src1_2 && src2_2);
            irManager->newInstEx(Mnemonic_ADD, 1, dst_1, src1_1, src2_1)->insertBefore(inst);
            irManager->newInstEx(Mnemonic_ADC, 1, dst_2, src1_2, src2_2)->insertBefore(inst);
            break;
        case Mnemonic_SUB   :
            assert(dst_1 && src1_1 && src2_1);
            assert(dst_2 && src1_2 && src2_2);
            irManager->newInstEx(Mnemonic_SUB, 1, dst_1, src1_1, src2_1)->insertBefore(inst);
            irManager->newInstEx(Mnemonic_SBB, 1, dst_2, src1_2, src2_2)->insertBefore(inst);
            break;
        case Mnemonic_AND   :
        case Mnemonic_OR    :
        case Mnemonic_XOR   :
            assert(dst_1 && src1_1 && src2_1);
            assert(dst_2 && src1_2 && src2_2);
        case Mnemonic_NOT   :
            assert(dst_1 && src1_1);
            assert(dst_2 && src1_2);
            irManager->newInstEx(mn, 1, dst_1, src1_1, src2_1)->insertBefore(inst);
            irManager->newInstEx(mn, 1, dst_2, src1_2, src2_2)->insertBefore(inst);
            break;
        case Mnemonic_MOV   :
            assert(dst_1 && src1_1);
            irManager->newCopyPseudoInst(Mnemonic_MOV, dst_1, src1_1)->insertBefore(inst);
            if (dst_2 && src1_2) {
                irManager->newCopyPseudoInst(Mnemonic_MOV, dst_2, src1_2)->insertBefore(inst);
            }
            break;
        case Mnemonic_MOVSX :
        case Mnemonic_MOVZX :
            assert(dst_1 && dst_2 && src1_1);
            assert(!src1_2);
            if (src1_1->getSize()<OpndSize_32){
                irManager->newInstEx(mn, 1, dst_1, src1_1)->insertBefore(inst);
            }else{
                assert(src1_1->getSize()==OpndSize_32);
                irManager->newInstEx(Mnemonic_MOV, 1, dst_1, src1_1)->insertBefore(inst);
            }
            if (mn==Mnemonic_MOVSX){
                // It's possible to substitute complex CDQ with a tight
                // constraints to the set of simpler instructions
                // with a wider constraints to let more freedom 
                // to regalloc and constraint resolver.
                // However, this seems does not change anything currently,
                // so leaving as-is.
                //test      low, low
                //setns     hi      ; if lo is positive, then load 1 into hi
                //sub       hi, 1   ; if lo is positive, then hi is now '0'. otherwise, it's -1
                irManager->newInstEx(Mnemonic_CDQ, 1, dst_2, dst_1)->insertBefore(inst);
            } else {
                //fill upper word with 0
                assert(mn == Mnemonic_MOVZX);
                Opnd* imm0=irManager->newImmOpnd(irManager->getTypeManager().getInt32Type(), 0);
                irManager->newInstEx(Mnemonic_MOV, 1, dst_2, imm0)->insertBefore(inst);
            }
            break;
        case Mnemonic_PUSH  :
            assert(src1_1);
            assert(src1_2);
            irManager->newInstEx(Mnemonic_PUSH, 0, src1_2)->insertBefore(inst);
            irManager->newInstEx(Mnemonic_PUSH, 0, src1_1)->insertBefore(inst);
            break;
        case Mnemonic_POP   :
            assert(dst_1);
            assert(dst_2);
            irManager->newInstEx(Mnemonic_POP, 1, dst_1)->insertBefore(inst);
            irManager->newInstEx(Mnemonic_POP, 1, dst_2)->insertBefore(inst);
            break;
        case Mnemonic_SHL   :
        {
            assert(dst && src1 && src2);
            buildShiftSubGraph(inst, src1_2, src1_1, src2, dst_2, dst_1, mn, Mnemonic_SHR);
            break;
        }
        case Mnemonic_SHR   :
        case Mnemonic_SAR   :
        {
            assert(dst && src1 && src2);
            buildShiftSubGraph(inst, src1_1, src1_2, src2, dst_1, dst_2, mn, Mnemonic_SHL);
            break;
        }
        case Mnemonic_CMP   :
        {
            assert(src1 && src2);
            Inst * condInst = inst->getNextInst();
            while (condInst && condInst->getMnemonic() == Mnemonic_MOV) {
                condInst = condInst->getNextInst();
            }
            Mnemonic mnem = condInst ? getBaseConditionMnemonic(condInst->getMnemonic()) : Mnemonic_NULL;
            if (mnem != Mnemonic_NULL) {
                        if(condInst->hasKind(Inst::Kind_BranchInst)) {
                            compareAndBranch(inst,src1_1,src1_2,src2_1,src2_2,condInst);
                        } else {
                            buildSetSubGraph(inst,src1_1,src1_2,src2_1,src2_2,condInst);
                        }
            } else {
                buildComplexSubGraph(inst,src1_1,src1_2,src2_1,src2_2);
            }
            break;
        }
        case Mnemonic_IMUL:
            lowerMul64(inst);
            break;
        case Mnemonic_IDIV:
            if (isI8RemInst(inst)) {
                lowerRem64(inst);
            }
            else {
                lowerDiv64(inst);
            }
            break;
        default :   
            assert(0);
    }
} // applyMnemonic

void I8Lowerer::buildShiftSubGraph(Inst * inst, Opnd * src1_1, Opnd * src1_2, Opnd * src2, Opnd * dst_1, Opnd * dst_2, Mnemonic mnem, Mnemonic opMnem) {
    Opnd * dst1_1 = irManager->newOpnd(dst_2->getType()), 
            * dst1_2 = irManager->newOpnd(dst_2->getType()), 
            * tmpSrc2 = irManager->newOpnd(src2->getType());

    if(src2->isPlacedIn(OpndKind_Imm)) {
        int64 immVal = src2->getImmValue();
        immVal &= 63;
        if (immVal == 32) {
            irManager->newCopyPseudoInst(Mnemonic_MOV, dst_1, src1_2)->insertBefore(inst);
            if (mnem != Mnemonic_SAR) {
                irManager->newCopyPseudoInst(Mnemonic_MOV, dst_2, irManager->newImmOpnd(src1_1->getType(), 0))->insertBefore(inst);
            } else {
                irManager->newInstEx(mnem, 1, dst_2, src1_2, irManager->newImmOpnd(src2->getType(), 31))->insertBefore(inst);
            }
        } else if(immVal > 31) {
            if (mnem != Mnemonic_SAR) {
                irManager->newCopyPseudoInst(Mnemonic_MOV, dst_2, irManager->newImmOpnd(src1_1->getType(), 0))->insertBefore(inst);
            } else {
                irManager->newInstEx(mnem, 1, dst_2, src1_2, irManager->newImmOpnd(src2->getType(), 31))->insertBefore(inst);
            }
            irManager->newInstEx(mnem, 1, dst_1, src1_2, src2)->insertBefore(inst);
        } else if (immVal == 0) {
            irManager->newCopyPseudoInst(Mnemonic_MOV, dst_2, src1_2)->insertBefore(inst);
            irManager->newCopyPseudoInst(Mnemonic_MOV, dst_1, src1_1)->insertBefore(inst);
        } else {
            irManager->newInstEx(mnem == Mnemonic_SAR?Mnemonic_SHR:mnem, 1, dst1_1, src1_1, src2)->insertBefore(inst);
            irManager->newInstEx(opMnem, 1, dst1_2, src1_2, irManager->newImmOpnd(src2->getType(), 32-immVal))->insertBefore(inst);
            irManager->newInstEx(Mnemonic_OR, 1, dst_1, dst1_2, dst1_1)->insertBefore(inst);
            irManager->newInstEx(mnem, 1, dst_2, src1_2, src2)->insertBefore(inst);
        }
    } else {
        ControlFlowGraph* subCFG = irManager->createSubCFG(true, false);

        Node* bbMain = subCFG->getEntryNode(),
                    * lNode = subCFG->createBlockNode(),
                    * gNode =  subCFG->createBlockNode(),
                    * nullNode =  subCFG->createBlockNode(),
                    * cmpNullNode = subCFG->createBlockNode();

        bbMain->appendInst(irManager->newInstEx(Mnemonic_AND, 1, src2, src2, irManager->newImmOpnd(src2->getType(), 63)));
        bbMain->appendInst(irManager->newInst(Mnemonic_CMP, src2, irManager->newImmOpnd(src2->getType(), 31)));
        bbMain->appendInst(irManager->newBranchInst(getMnemonic(Mnemonic_Jcc, ConditionMnemonic_LE), cmpNullNode, gNode));

        cmpNullNode->appendInst(irManager->newInst(Mnemonic_CMP, src2, irManager->newImmOpnd(src2->getType(), 0)));
        cmpNullNode->appendInst(irManager->newBranchInst(getMnemonic(Mnemonic_Jcc, ConditionMnemonic_E), nullNode, lNode));

        nullNode->appendInst(irManager->newCopyPseudoInst(Mnemonic_MOV, dst_1, src1_1));
        nullNode->appendInst(irManager->newCopyPseudoInst(Mnemonic_MOV, dst_2, src1_2));

        if (mnem == Mnemonic_SAR)
            gNode->appendInst(irManager->newInstEx(mnem, 1, dst_2, src1_2, irManager->newImmOpnd(src2->getType(), 31)));
        else
            gNode->appendInst(irManager->newCopyPseudoInst(Mnemonic_MOV, dst_2, irManager->newImmOpnd(dst_2->getType(), 0)));
        gNode->appendInst(irManager->newInstEx(mnem == Mnemonic_SAR?Mnemonic_SHR:mnem, 1, dst_1, src1_2, src2));

        lNode->appendInst(irManager->newInstEx(mnem == Mnemonic_SAR?Mnemonic_SHR:mnem, 1, dst1_1, src1_1, src2));
        lNode->appendInst(irManager->newInstEx(Mnemonic_SUB, 1, tmpSrc2, irManager->newImmOpnd(src2->getType(), 32), src2));
        lNode->appendInst(irManager->newInstEx(opMnem, 1, dst1_2, src1_2, tmpSrc2));
        lNode->appendInst(irManager->newInstEx(Mnemonic_OR, 1, dst_1, dst1_2, dst1_1));
        lNode->appendInst(irManager->newInstEx(mnem, 1, dst_2, src1_2, src2));
        
        Node* sinkNode =  subCFG->getReturnNode();
        
        subCFG->addEdge(bbMain, cmpNullNode, 0.5);
        subCFG->addEdge(bbMain, gNode, 0.5);
        subCFG->addEdge(cmpNullNode, nullNode, 0.5);
        subCFG->addEdge(cmpNullNode, lNode, 0.5);
        subCFG->addEdge(gNode, sinkNode, 1);
        subCFG->addEdge(lNode, sinkNode, 1);
        subCFG->addEdge(nullNode, sinkNode, 1);
        
        irManager->getFlowGraph()->spliceFlowGraphInline(inst, *subCFG);
    }
}

void I8Lowerer::compareAndBranch(Inst * inst, Opnd * src1_1,Opnd * src1_2,Opnd * src2_1,Opnd * src2_2, Inst * condInst) {
    ControlFlowGraph* cfg = irManager->getFlowGraph();

    Node * bb = inst->getNode();
    Edge* dbEdge = bb->getTrueEdge();
    Edge* ftEdge = bb->getFalseEdge();
    Node* bbDB = dbEdge->getTargetNode();
    Node* bbFT = ftEdge->getTargetNode();
    Mnemonic mn = condInst->getMnemonic();
    
    //for == and != algorithms checks low parts first
    if (mn == Mnemonic_JNZ) { // !=
        Node* cmpHiNode = cfg->createBlockNode();
        cfg->replaceEdgeTarget(ftEdge, cmpHiNode, true);
        
        //if (lo1!=lo2) return true;
        bb->appendInst(irManager->newInst(Mnemonic_CMP, src1_1, src2_1));        
        bb->appendInst(irManager->newBranchInst(Mnemonic_JNZ, bbDB, cmpHiNode));

        //if (hi1!=hi2) return true;
        cmpHiNode->appendInst(irManager->newInst(Mnemonic_CMP, src1_2, src2_2));
        cmpHiNode->appendInst(irManager->newBranchInst(Mnemonic_JNZ, bbDB, bbFT));
        
        cfg->addEdge(cmpHiNode, bbDB, dbEdge->getEdgeProb());
        cfg->addEdge(cmpHiNode, bbFT, ftEdge->getEdgeProb());
    } else if (mn == Mnemonic_JZ) { // ==
        Node* cmpHiNode = cfg->createBlockNode();
        cfg->replaceEdgeTarget(dbEdge, cmpHiNode, true);
       
        //if lo1==lo2 check hi parts
        bb->appendInst(irManager->newInst(Mnemonic_CMP, src1_1, src2_1));        
        bb->appendInst(irManager->newBranchInst(Mnemonic_JZ, cmpHiNode, bbFT));

        //if hi1==hi2 -> return true
        cmpHiNode->appendInst(irManager->newInst(Mnemonic_CMP, src1_2, src2_2));
        cmpHiNode->appendInst(irManager->newBranchInst(Mnemonic_JZ, bbDB, bbFT));

        cfg->addEdge(cmpHiNode, bbDB, dbEdge->getEdgeProb());
        cfg->addEdge(cmpHiNode, bbFT, ftEdge->getEdgeProb());
    } else { // >=, >, <=, <
        //if hi parts equals -> compare low parts
        //else compare hi parts
        Node* cmpHiNode = cfg->createBlockNode();
        Node* cmpLoNode = cfg->createBlockNode();

        //check if hi parts are equal
        cfg->replaceEdgeTarget(dbEdge, cmpHiNode, true);
        cfg->replaceEdgeTarget(ftEdge, cmpLoNode, true);
        bb->appendInst(irManager->newInst(Mnemonic_CMP, src1_2, src2_2));        
        bb->appendInst(irManager->newBranchInst(Mnemonic_JNE, cmpHiNode, cmpLoNode));

        //check hi parts: reuse old CMP result here
        cmpHiNode->appendInst(irManager->newBranchInst(mn, bbDB, bbFT));
        cfg->addEdge(cmpHiNode, bbDB, dbEdge->getEdgeProb());
        cfg->addEdge(cmpHiNode, bbFT, ftEdge->getEdgeProb());
        
        //check lo parts, use unsigned comparison
        Mnemonic unsignedMn = Mnemonic_NULL;
        switch(mn) {
            case Mnemonic_JL  : unsignedMn = Mnemonic_JB; break;
            case Mnemonic_JLE : unsignedMn = Mnemonic_JBE; break;
            case Mnemonic_JG  : unsignedMn = Mnemonic_JA; break;
            case Mnemonic_JGE : unsignedMn = Mnemonic_JAE; break;
            default: unsignedMn = mn; break;
        }
        cmpLoNode->appendInst(irManager->newInst(Mnemonic_CMP, src1_1, src2_1));        
        cmpLoNode->appendInst(irManager->newBranchInst(unsignedMn, bbDB, bbFT));
        cfg->addEdge(cmpLoNode, bbDB, dbEdge->getEdgeProb());
        cfg->addEdge(cmpLoNode, bbFT, ftEdge->getEdgeProb());
    }
    condInst->unlink();
}


void I8Lowerer::buildSetSubGraph(Inst * inst, Opnd * src1_1,Opnd * src1_2,Opnd * src2_1,Opnd * src2_2, Inst * condInst) {

    ControlFlowGraph* subCFG = irManager->createSubCFG(true, false);
    
    Node* bbHMain = subCFG->getEntryNode();
    Node* bbHF = subCFG->createBlockNode();
    Node* bbLMain = subCFG->createBlockNode();
    Node* sinkNode =  subCFG->getReturnNode();

    bbHMain->appendInst(irManager->newInst(Mnemonic_CMP, src1_2, src2_2));
    bbHMain->appendInst(irManager->newBranchInst(Mnemonic_JNZ, bbHF, bbLMain));

    bbLMain->appendInst(irManager->newInst(Mnemonic_CMP, src1_1, src2_1));
    
    irManager->getFlowGraph()->splitNodeAtInstruction(condInst, true, true, NULL);
    
    Mnemonic mnem = getBaseConditionMnemonic(condInst->getMnemonic());

    Inst * nextInst = inst->getNextInst();
    if(nextInst->getMnemonic() == Mnemonic_MOV) {
        bbHF->appendInst(irManager->newCopyPseudoInst(Mnemonic_MOV, nextInst->getOpnd(0), nextInst->getOpnd(1)));
        bbLMain->appendInst(irManager->newCopyPseudoInst(Mnemonic_MOV, nextInst->getOpnd(0), nextInst->getOpnd(1)));
        nextInst->unlink();
    } else {
        assert(nextInst == condInst);
    }
    bbHF->appendInst(irManager->newInst(condInst->getMnemonic(), condInst->getOpnd(0), mnem == Mnemonic_CMOVcc? condInst->getOpnd(1): NULL));
    
    ConditionMnemonic condMnem = ConditionMnemonic(condInst->getMnemonic()-mnem);
    switch(condMnem) {
        case ConditionMnemonic_L : condMnem = ConditionMnemonic_B; break;
        case ConditionMnemonic_LE : condMnem = ConditionMnemonic_BE; break;
        case ConditionMnemonic_G : condMnem = ConditionMnemonic_A; break;
        case ConditionMnemonic_GE : condMnem = ConditionMnemonic_AE; break;
        default: break;
    }
    bbLMain->appendInst(irManager->newInst(Mnemonic(mnem+condMnem), condInst->getOpnd(0), mnem == Mnemonic_CMOVcc? condInst->getOpnd(1): NULL));
    
    subCFG->addEdge(bbHMain, bbHF, 0.1);
    subCFG->addEdge(bbHMain, bbLMain, 0.9);
    subCFG->addEdge(bbLMain, sinkNode, 1); 
    subCFG->addEdge(bbHF, sinkNode, 1);

    condInst->unlink();

    irManager->getFlowGraph()->spliceFlowGraphInline(inst, *subCFG);
}

void I8Lowerer::buildComplexSubGraph(Inst * inst, Opnd * src1_1,Opnd * src1_2,Opnd * src2_1,Opnd * src2_2, Inst * condInst) {

    Opnd * dst_1 = irManager->newOpnd(irManager->getTypeManager().getInt32Type());
    ControlFlowGraph* subCFG = irManager->createSubCFG(true, false);
    Node* bbHMain = subCFG->getEntryNode();
    Node* bbHF = subCFG->createBlockNode();
    Node* bbHS = subCFG->createBlockNode();

    Node* bbLMain = subCFG->createBlockNode();
    Node* bbLF = subCFG->createBlockNode();
    Node* bbLS = subCFG->createBlockNode();
    Node* sinkNode =  subCFG->getReturnNode();

    bbHMain->appendInst(irManager->newCopyPseudoInst(Mnemonic_MOV, dst_1, irManager->newImmOpnd(irManager->getTypeManager().getInt32Type(), 0)));
    bbHMain->appendInst(irManager->newInst(Mnemonic_CMP, src1_2, src2_2));
    bbHMain->appendInst(irManager->newBranchInst(Mnemonic_JZ, bbLMain, bbHF));
    
    bbHF->appendInst(irManager->newCopyPseudoInst(Mnemonic_MOV, dst_1, irManager->newImmOpnd(irManager->getTypeManager().getInt32Type(), -1)));
    bbHF->appendInst(irManager->newBranchInst(Mnemonic_JG, bbHS, sinkNode));
    
    bbHS->appendInst(irManager->newCopyPseudoInst(Mnemonic_MOV, dst_1, irManager->newImmOpnd(irManager->getTypeManager().getInt32Type(), 1)));
    
    bbLMain->appendInst(irManager->newInst(Mnemonic_CMP, src1_1, src2_1));
    bbLMain->appendInst(irManager->newBranchInst(Mnemonic_JZ, sinkNode, bbLF));

    bbLF->appendInst(irManager->newCopyPseudoInst(Mnemonic_MOV, dst_1, irManager->newImmOpnd(irManager->getTypeManager().getInt32Type(), -1)));
    bbLF->appendInst(irManager->newBranchInst(Mnemonic_JA, bbLS, sinkNode));
    
    bbLS->appendInst(irManager->newCopyPseudoInst(Mnemonic_MOV, dst_1, irManager->newImmOpnd(irManager->getTypeManager().getInt32Type(), 1)));
    
    sinkNode->appendInst(irManager->newInst(Mnemonic_CMP, dst_1, irManager->newImmOpnd(irManager->getTypeManager().getInt32Type(), 0)));
    
    
    subCFG->addEdge(bbHMain, bbHF, 0.1);
    subCFG->addEdge(bbHMain, bbLMain, 0.9);
    subCFG->addEdge(bbHF, sinkNode, 0.5);
    subCFG->addEdge(bbHF, bbHS, 0.5);
    subCFG->addEdge(bbHS, sinkNode, 1); 
    subCFG->addEdge(bbLMain, bbLF, 0.1);
    subCFG->addEdge(bbLMain, sinkNode, 0.9);
    subCFG->addEdge(bbLF, sinkNode, 0.5);
    subCFG->addEdge(bbLF, bbLS, 0.5);
    subCFG->addEdge(bbLS, sinkNode, 1);
    
    irManager->getFlowGraph()->spliceFlowGraphInline(inst, *subCFG);
}

void I8Lowerer::prepareNewOpnds(Opnd * longOpnd, Opnd*& newOp1, Opnd*& newOp2)
{
    if (!isI8Type(longOpnd->getType())){
        newOp1=longOpnd;
        newOp2=NULL;
        return;
    }

    if(m_pairs->find(longOpnd)!=m_pairs->end()) {
        newOp1 = (*m_pairs)[longOpnd][0];
        newOp2 = (*m_pairs)[longOpnd][1];
    } else {
        if(longOpnd->isPlacedIn(OpndKind_Memory)) {
            newOp1 = irManager->newOpnd(irManager->getTypeManager().getUInt32Type());
            newOp2 = irManager->newOpnd(irManager->getTypeManager().getInt32Type());
            Opnd * opnds[2] = { newOp1, newOp2 };
            irManager->assignInnerMemOpnds(longOpnd, opnds, 2);
        } else if(longOpnd->isPlacedIn(OpndKind_Imm)){
            newOp1 = irManager->newImmOpnd(irManager->getTypeManager().getUInt32Type(), (U_32)longOpnd->getImmValue());
            newOp2 = irManager->newImmOpnd(irManager->getTypeManager().getInt32Type(), (I_32)(longOpnd->getImmValue()>>32));
        } else {
            newOp1 = irManager->newOpnd(irManager->getTypeManager().getUInt32Type());
            newOp2 = irManager->newOpnd(irManager->getTypeManager().getInt32Type());
        }
        (*m_pairs)[longOpnd] = new(irManager->getMemoryManager()) Opnd*[2];
        (*m_pairs)[longOpnd][0]=newOp1;
        (*m_pairs)[longOpnd][1]=newOp2;
    }
}

void I8Lowerer::lowerMul64(Inst* inst)
{
    assert(inst->getOpndRoles(0) & Inst::OpndRole_Def);
    assert(inst->getOpndRoles(1) & Inst::OpndRole_Use);
    assert(inst->getOpndRoles(2) & Inst::OpndRole_Use);
    Opnd* dst = inst->getOpnd(0);
    Opnd* src1 = inst->getOpnd(1);
    Opnd* src2 = inst->getOpnd(2);
    assert(dst && src1 && src2);

    if ( isZeroImm(src1) || isZeroImm(src2)) {
        // Multiplue by zero - noop.
        inst->unlink();
        return;
    }

    bool inline_mul64 = true;
    getArg("inline_mul64", inline_mul64);

    if (!inline_mul64) {
        Opnd * args[2]={ src1, src2 };
        CallInst * callInst = irManager->newInternalRuntimeHelperCallInst("imul64", 2, args, dst);
        callInst->insertBefore(inst);
        processOpnds(callInst);
        inst->unlink();
        return;
    }

    inlineMul64(inst);
}

void I8Lowerer::inlineMul64(Inst* inst)
{
    assert(inst->getOpndRoles(0) & Inst::OpndRole_Def);
    assert(inst->getOpndRoles(1) & Inst::OpndRole_Use);
    assert(inst->getOpndRoles(2) & Inst::OpndRole_Use);
    Opnd* dst = inst->getOpnd(0);
    Opnd* src1 = inst->getOpnd(1);
    Opnd* src2 = inst->getOpnd(2);

    bool inline_mul64_checks = false;
    getArg("inline_mul64_checks", inline_mul64_checks);

    Opnd * dst_1 = NULL, * dst_2 = NULL;
    Opnd * src1_1 = NULL, * src1_2 = NULL;
    Opnd * src2_1 = NULL, * src2_2 = NULL;

    prepareNewOpnds(dst, dst_1, dst_2);
    prepareNewOpnds(src1, src1_1, src1_2);
    prepareNewOpnds(src2, src2_1, src2_2);

    TypeManager& tm = irManager->getTypeManager();
    Type* int32type = tm.getInt32Type();
    // Name them eax, ecx, edx here, in code only, to refer same regs as in 
    // the comments, but let the register allocator to decide the proper 
    // ones.
    Opnd*eax;
    Opnd*edx;
    Opnd*ecx;

    eax = irManager->newOpnd(int32type);
    edx = irManager->newOpnd(int32type);
    ecx = irManager->newOpnd(int32type);

    Opnd* zero = irManager->newImmOpnd(int32type, 0);
    
    //
    Opnd* a_lo = src1_1;
    Opnd* a_hi = src1_2;
    Opnd* b_lo = src2_1;
    Opnd* b_hi = src2_2;
    Opnd* res_lo = dst_1;
    Opnd* res_hi = dst_2;
    
    /*
    Strategy:
    result = a*b ; 
        a = a_lo + a_hi*(1<<32)
        b = b_lo + b_hi*(1<<32)
    - a*b = (a_lo + a_hi*(1<<32))*(b_lo + b_hi*(1<<32)) = 
          a_lo*b_lo + a_hi*(1<<32)*b_lo + a_lo*b_hi*(1<<32) + a_hi*(1<<32)*b_hi*(1<<32)
    
    - a_hi*(1<<32)*b_hi*(1<<32) - can be dropped off, it's overflow
    - with any of `a_hi*b_lo*(1<<32)` + `a_lo*b_hi*(1<<32)` we can ignore high 32 bits 
        of result (EDX) - these are overflow again 
    So, we're getting:

    a_lo*b_lo + a_hi*b_lo*(1<<32) + a_lo*b_hi*(1<<32)
    (1)         (2)             (3)
    */
    //
    if (inline_mul64_checks) {
        newSubGFG();
        Node* entryNode = getSubCfgEntryNode();
    
        Node* longMulNode = newBB();
        Node* storeResultNode = newBB();
        setCurrentNode(NULL); 
    
        Node* test_A_Node = newBB();
        Node* test_B_Node = newBB();
        Node* simpleMulNode = newBB();
        setCurrentNode(NULL);
        
        connectNodes(entryNode, test_A_Node);
        setCurrentNode(test_A_Node);
        //
        // test_A_Node:
        //
        /* cmp a_hi, 0*/    newInst(Mnemonic_CMP, a_hi, zero);
        /* jnz  longMul */  newBranch(Mnemonic_JNZ, longMulNode, test_B_Node);
        setCurrentNode(NULL);
        test_A_Node = (Node*)0xDEADBEEF;
        
        //
        // test_B_Node:
        //
        setCurrentNode(test_B_Node);
        /* cmp b_hi, 0*/    newInst(Mnemonic_CMP, b_hi, zero);
        /* jnz  longMul */  newBranch(Mnemonic_JNZ, longMulNode, simpleMulNode);
        setCurrentNode(NULL);
        test_B_Node = (Node*)0xDEADBEEF;
        
        //
        // simpleMulNode:
        //
        setCurrentNode(simpleMulNode);
        /* mov eax, a_lo */ newInst(Mnemonic_MOV, eax, a_lo);
        /* imul b_lo */     newInst(Mnemonic_MUL, 2, edx, eax, eax, b_lo);
        connectNodeTo(storeResultNode);
        setCurrentNode(NULL);
        simpleMulNode = (Node*)0xDEADBEEF;

        entryNode = (Node*)0xDEADBEEF;
    
        setCurrentNode(longMulNode);
        /* mov eax, a_lo*/  newInst(Mnemonic_MOV, eax, a_lo);
        /* mul     b_hi */  newInst(Mnemonic_MUL, 2, edx, eax, eax, b_hi);
        //
        // Now, EAX=a_lo*b_hi, EDX content is dropped
        //
                            // save a_lo*b_hi(EAX) 
        /* mov ecx, eax */  newInst(Mnemonic_MOV, ecx, eax);
        /* mov eax, a_hi*/  newInst(Mnemonic_MOV, eax, a_hi);
        /* mul b_lo     */  newInst(Mnemonic_MUL, 2, edx, eax, eax, b_lo);
        /* add ecx, eax */  newInst(Mnemonic_ADD, 1, ecx, ecx, eax);

        /* mov eax, a_lo */ newInst(Mnemonic_MOV, eax, a_lo);
        /* mul b_lo      */ newInst(Mnemonic_MUL, 2, edx, eax, eax, b_lo);
        /* add edx, ecx */  newInst(Mnemonic_ADD, 1, edx, edx, ecx);
        connectNodes(longMulNode, storeResultNode);
        setCurrentNode(NULL);
        longMulNode = (Node*)0xDEADBEEF;

        //
        // storeResultNode:
        //
        setCurrentNode(storeResultNode);
        /* mov res_lo, eax*/    newInst(Mnemonic_MOV, res_lo, eax);
        /* mov res_hi, edx*/    newInst(Mnemonic_MOV, res_hi, edx);
        setCurrentNode(NULL);
        connectNodes(storeResultNode, getSubCfgReturnNode());

        propagateSubCFG(inst);
    } else {
        /* mov eax, a_lo */ irManager->newCopyPseudoInst(Mnemonic_MOV, eax, a_lo)->insertBefore(inst);
        /* mul     b_hi */  irManager->newInstEx(Mnemonic_MUL, 2, edx, eax, eax, b_hi, NULL)->insertBefore(inst);
        // Now, EAX=low32bit(a_lo*b_hi), EDX content is dropped

        /* mov ecx, eax */  irManager->newCopyPseudoInst(Mnemonic_MOV, ecx, eax)->insertBefore(inst); 
        // Now, a_lo*b_hi is saved to ECX
        
        /* mov eax, a_hi*/  irManager->newCopyPseudoInst(Mnemonic_MOV, eax, a_hi)->insertBefore(inst);
        /* mul b_lo     */  irManager->newInstEx(Mnemonic_MUL, 2, edx, eax, eax, b_lo, NULL)->insertBefore(inst);
        // Now, EAX=low32bit(a_hi*b_lo), EDX content is dropped
        
        /* add ecx, eax */  irManager->newInstEx(Mnemonic_ADD, 1, ecx, ecx, eax)->insertBefore(inst);
        // Now, ECX=low32bit(a_lo*b_hi+a_hi*b_lo)

        /* mov eax, a_lo */ irManager->newCopyPseudoInst(Mnemonic_MOV, eax, a_lo)->insertBefore(inst);
        /* mul b_lo      */ irManager->newInstEx(Mnemonic_MUL, 2, edx, eax, eax, b_lo, NULL)->insertBefore(inst);
        // Now, EAX=low32bit(a_lo*b_lo) and EDX=high32bit(a_lo*b_lo)
        
        /* add edx, ecx */  irManager->newInstEx(Mnemonic_ADD, 1, edx, edx, ecx)->insertBefore(inst);
        // Now, EDX=low32bit(a_lo*b_hi+a_hi*b_lo)+high32bit(a_lo*b_lo)

        /* mov res_lo, eax*/    irManager->newCopyPseudoInst(Mnemonic_MOV, res_lo, eax)->insertBefore(inst);
        /* mov res_hi, edx*/    irManager->newCopyPseudoInst(Mnemonic_MOV, res_hi, edx)->insertBefore(inst);
    }
}

void I8Lowerer::lowerDiv64(Inst* inst)
{
    assert(inst->getOpndRoles(0) & Inst::OpndRole_Def);
    assert(inst->getOpndRoles(1) & Inst::OpndRole_Use);
    assert(inst->getOpndRoles(2) & Inst::OpndRole_Use);
    Opnd* dst = inst->getOpnd(0);
    Opnd* src1 = inst->getOpnd(1);
    Opnd* src2 = inst->getOpnd(2);
    assert(dst && src1 && src2);
    
    bool inline_div64 = true;
    getArg("inline_div64", inline_div64);

    if (!inline_div64) {
        Opnd * args[2]={ src1, src2 };
        CallInst * callInst = irManager->newInternalRuntimeHelperCallInst("idiv64", 2, args, dst);
        callInst->insertBefore(inst);
        processOpnds(callInst);
        inst->unlink();
        return;
    }
    inlineDivRem64(false, inst);
}

void I8Lowerer::lowerRem64(Inst* inst)
{
    assert(inst->getOpndRoles(0) & Inst::OpndRole_Def);
    assert(inst->getOpndRoles(1) & Inst::OpndRole_Use);
    assert(inst->getOpndRoles(2) & Inst::OpndRole_Use);
    Opnd* dst = inst->getOpnd(0);
    Opnd* src1 = inst->getOpnd(1);
    Opnd* src2 = inst->getOpnd(2);
    assert(dst && src1 && src2);

    bool inline_rem64 = true;
    getArg("inline_rem64", inline_rem64);

    if (!inline_rem64) {
        Opnd * args[2]={ src1, src2 };
        CallInst * callInst = irManager->newInternalRuntimeHelperCallInst("irem64", 2, args, dst);
        callInst->insertBefore(inst);
        processOpnds(callInst);
        inst->unlink();
        return;
    }
    inlineDivRem64(true, inst);
}

void I8Lowerer::inlineDivRem64(bool wantReminder, Inst* inst)
{
    assert(inst->getOpndRoles(0) & Inst::OpndRole_Def);
    assert(inst->getOpndRoles(1) & Inst::OpndRole_Use);
    assert(inst->getOpndRoles(2) & Inst::OpndRole_Use);
    Opnd* dst = inst->getOpnd(0);
    Opnd* src1 = inst->getOpnd(1);
    Opnd* src2 = inst->getOpnd(2);
    
    Opnd * res_lo = NULL, * res_hi = NULL;
    Opnd * a_lo = NULL, * a_hi = NULL;
    Opnd * b_lo = NULL, * b_hi = NULL;

    prepareNewOpnds(dst,  res_lo, res_hi);
    prepareNewOpnds(src1, a_lo, a_hi);
    prepareNewOpnds(src2, b_lo, b_hi);

    //
    TypeManager& tm = irManager->getTypeManager();
    Type* int32type = tm.getInt32Type();
    
    Opnd* edx = irManager->newOpnd(int32type);
    Opnd* eax = irManager->newOpnd(int32type);
    Opnd* ecx = irManager->newOpnd(int32type);
    Opnd* ebx = irManager->newOpnd(int32type);
    Opnd* edi = irManager->newOpnd(int32type);
    Opnd* esi = irManager->newOpnd(int32type);

    Opnd* temp_lo = irManager->newOpnd(int32type);
    Opnd* temp_hi = irManager->newOpnd(int32type);
    Opnd* zero = irManager->newImmOpnd(int32type, 0);
    Opnd* one = irManager->newImmOpnd(int32type, 1);
    //

    newSubGFG();
    
    Node* entryNode = getSubCfgEntryNode();
    Node* check_A_SignNode = newBB();
    Node* neg_A_Node = newBB();
    Node* check_B_SignNode = newBB();
    Node* neg_B_Node = newBB();
    
    Node* testBigDvsrNode = newBB();
    Node* testOneDivNode = newBB();
    Node* simpleDivNode = newBB();
    Node* oneDivNode = newBB();
    Node* bigDivisorNode = newBB();
    Node* storeResultNode = newBB();
    
    //
    // Here we go.
    // The algorithm is based on unsigned div algo from assembly gems page
    // http://www.df.lth.se/~john_e/gems/gem002a.html (also published in AMD 
    // optimization manual)  and modified for signed arithmetics and for 
    // the Jitrino CG specifics.
    //

    //
    // entryNode:
    //
    // Preparation - load all into 'registers'
    //

    Opnd* fixQuotSign = irManager->newOpnd(int32type);
    Opnd* fixRemSign = irManager->newOpnd(int32type);
                        // Sign presumed positive
    setCurrentNode(entryNode);
    /* mov fixQuotSign, 0*/ newInst(Mnemonic_MOV, fixQuotSign, zero);
    /* mov fixRemSign, 0*/  newInst(Mnemonic_MOV, fixRemSign, zero);
    
    /* mov edx, a_hi*/  newInst(Mnemonic_MOV, edx, a_hi);
    /* mov eax, a_lo*/  newInst(Mnemonic_MOV, eax, a_lo);
    /* mov ecx, b_hi*/  newInst(Mnemonic_MOV, ecx, b_hi);
    /* mov ebx, b_lo*/  newInst(Mnemonic_MOV, ebx, b_lo);
    connectNodeTo(check_A_SignNode);
    setCurrentNode(NULL);
    entryNode = (Node*)0xDEADBEEF;

    //
    // check_A_SignNode:
    //
    // Test for signs and convert to unsigned when needed
    // 

    setCurrentNode(check_A_SignNode);
    /* test edx, edx*/ newInst(Mnemonic_TEST, edx, edx);
    /* jns  check_B_Sign*/ newBranch(Mnemonic_JNS, check_B_SignNode, neg_A_Node);
    setCurrentNode(NULL);
    check_A_SignNode = (Node*)0xDEADBEEF;

    //
    // neg_A:
    //
    //

    setCurrentNode(neg_A_Node);
    /* mov fixQuotSign, 1*/ newInst(Mnemonic_MOV, fixQuotSign, one);
    /* mov fixRemSign, 1*/  newInst(Mnemonic_MOV, fixRemSign, one);
    /* neg eax      */  newInst(Mnemonic_NEG, 1, eax, eax);
    /* adc edx, 0   */  newInst(Mnemonic_ADC, 1, edx, edx, zero);
    /* neg edx      */  newInst(Mnemonic_NEG, 1, edx, edx);
    connectNodeTo(check_B_SignNode);
    setCurrentNode(NULL);
    neg_A_Node = (Node*)0xDEADBEEF;

    //
    // check_B_Sign_Node:
    //
    setCurrentNode(check_B_SignNode);
    /* test ecx, ecx*/  newInst(Mnemonic_TEST, ecx, ecx);
    /* jns  testBigDvsrNode*/ newBranch(Mnemonic_JNS, testBigDvsrNode, neg_B_Node);
    setCurrentNode(NULL);
    check_B_SignNode = (Node*)0xDEADBEEF;

    //
    // neg_B_Node:
    //
    // When doing REM, the result's sing only depends on 
    // dividend's sign no need to change fixRemSign
    
    setCurrentNode(neg_B_Node);
    /* XOR fixQuotSign, 1*/  newInst(Mnemonic_XOR, 1, fixQuotSign, fixQuotSign, one);
    /* neg ebx      */  newInst(Mnemonic_NEG, 1, ebx, ebx);
    /* adc ecx, 0   */  newInst(Mnemonic_ADC, 1, ecx, ecx, zero);
    /* neg ecx      */  newInst(Mnemonic_NEG, 1, ecx, ecx);
    connectNodeTo(testBigDvsrNode);
    setCurrentNode(NULL);
    neg_B_Node = (Node*)0xDEADBEEF;

    //
    // testBigDvsr:
    //
                        // divisor > 2^32-1 ?
    setCurrentNode(testBigDvsrNode);
    /* cmp ecx, 0   */  newInst(Mnemonic_CMP, ecx, zero); 
                        // YES - proceed to the hard way.
                        // NO - fall to the simpler path
    /* jnz BIG_DVSOR*/  newBranch(Mnemonic_JNZ, bigDivisorNode, testOneDivNode);
    setCurrentNode(NULL);
    testBigDvsrNode = (Node*)0xDEADBEEF;

    //
    // testOneDivNode:
    //
    //
                        //only one division needed ? (ecx = 0)
    setCurrentNode(testOneDivNode);
    /* cmp edx, ebx */  newInst(Mnemonic_CMP, edx, ebx);
                        // YES - one division sufficient
    /* jb ONE_DIV  */   newBranch(Mnemonic_JB, oneDivNode, simpleDivNode);
    setCurrentNode(NULL);
    testOneDivNode = (Node*)0xDEADBEEF;
    
    //
    // simpleDivNode:
    //

    setCurrentNode(simpleDivNode);
    /* mov ecx, eax */  newInst(Mnemonic_MOV, ecx, eax);     // save dividend-lo in ecx
    /* mov eax, edx */  newInst(Mnemonic_MOV, eax, edx);     // get dividend-hi
    /* xor edx, edx */  newInst(Mnemonic_MOV, edx, zero);    // zero extend it into edx:eax
    /* div ebx      */  newInst(Mnemonic_DIV, 2, edx, eax, edx, eax, ebx); // quotient-hi in eax
    /* xchg eax, ecx*/  newInst(Mnemonic_XCHG, eax, ecx);    //ecx = quotient-hi, eax =dividend-lo
    connectNodeTo(oneDivNode);
    setCurrentNode(NULL);
    simpleDivNode = (Node*)0xDEADBEEF;

    // 
    // oneDivNode:
    //

    setCurrentNode(oneDivNode);
    /* div ebx */       newInst(Mnemonic_DIV, 2, edx, eax, edx, eax, ebx);  // eax = quotient-lo
    /* mov ebx, edx */  newInst(Mnemonic_MOV, ebx, edx);    // ebx = remainder-lo
    /* mov edx, ecx */  newInst(Mnemonic_MOV, edx, ecx);    // edx = quotient-hi(quotient in edx:eax)
    /* xor ecx, ecx */  newInst(Mnemonic_MOV, ecx, zero);   // ecx = remainder-hi (rem. in ecx:ebx)
    /* jmp saveResult*/
    connectNodeTo(storeResultNode);
    setCurrentNode(NULL);
    oneDivNode = (Node*)0xDEADBEEF;

    //
    // bigDivisorNode:
    //
    setCurrentNode(bigDivisorNode);
    /* mov temp_hi, edx */  newInst(Mnemonic_MOV, temp_hi, edx);
    /* mov temp_lo, eax */  newInst(Mnemonic_MOV, temp_lo, eax);    // save dividend
    /* mov esi, ebx     */  newInst(Mnemonic_MOV, esi, ebx);
    /* mov edi, ecx     */  newInst(Mnemonic_MOV, edi, ecx);
                            // ^^^divisor now in edi:ebx and ecx:esi

                            // shift both divisor and dividend right on 1 bit
    /* shr edx, 1       */  newInst(Mnemonic_SHR, edx, one);
    /* rcr eax, 1       */  newInst(Mnemonic_RCR, eax, one);
    /* ror edi, 1       */  newInst(Mnemonic_ROR, edi, one);
    /* rcr ebx, 1       */  newInst(Mnemonic_RCR, ebx, one);

    //
    // FIXME: workarounding WebMaker problem, which does not like 
    // both DEF and USE of the same arg in the same instruction.
    // Replace, 'reg = BSR reg, 0' with 
    // temp = reg ; reg = BSR temp, 0
    // Funny thing is that this only happens with BSR.
    // 
#if 0 // _FIXME_
    /* bsr ecx, ecx  */     newInst(Mnemonic_BSR, 1, ecx, ecx);
#else
    Opnd* tmp = irManager->newOpnd(ecx->getType());
    /* mov tmp, ecx  */     newInst(Mnemonic_MOV, 1, tmp, ecx);
    /* bsr ecx, tmp  */     newInst(Mnemonic_BSR, 1, ecx, tmp);
#endif
                            // ecx = number of remaining shifts

                            // scale divisor and dividend down, so divisor fits
                            // into EBX (<2^32)
    // FIXME: Currently, constraint resolver fails to assign 'ecx' operand to the 
    // real ECX, and this causes SpillGen failure.
    // Forcing the ECX right here:
#if 0 //_FIXME_
    /* shrd ebx, edi, CL*/  newInst(Mnemonic_SHRD, ebx, edi, ecx);
    /* shrd eax, edx, CL*/  newInst(Mnemonic_SHRD, eax, edx, ecx);
    /* shr  edx, CL   */    newInst(Mnemonic_SHR, edx, ecx);
#else
    Opnd* trueECX = irManager->getRegOpnd(RegName_ECX);
    newInst(Mnemonic_MOV, trueECX, ecx);
    //~FIXME
    /* shrd ebx, edi, CL*/  newInst(Mnemonic_SHRD, ebx, edi, trueECX);
    /* shrd eax, edx, CL*/  newInst(Mnemonic_SHRD, eax, edx, trueECX);
    /* shr  edx, CL   */    newInst(Mnemonic_SHR, edx, trueECX);
#endif
    /* rol  edi, 1    */    newInst(Mnemonic_ROL, edi, one);
                            // get quotient
    /* div    ebx     */    newInst(Mnemonic_DIV, 2, edx, eax, edx, eax, ebx, NULL);
    /* mov ebx, temp_lo*/   newInst(Mnemonic_MOV, ebx, temp_lo);
    /* mov ecx, eax    */   newInst(Mnemonic_MOV, ecx, eax);
    /* imul edi, eax   */   newInst(Mnemonic_IMUL, edi, eax);
    /* mul  esi        */   newInst(Mnemonic_MUL, 2, edx, eax, eax, esi, NULL);
    /* add edx, edi    */   newInst(Mnemonic_ADD, 1, edx, edx, edi);    // edx:eax = quotient * divisor
    /* sub ebx, eax    */   newInst(Mnemonic_SUB, 1, ebx, ebx, eax);    // dividend-lo - (quot.*divisor)-lo
    /* mov eax, ecx    */   newInst(Mnemonic_MOV, eax, ecx);
    /* mov ecx, temp_hi*/   newInst(Mnemonic_MOV, ecx, temp_hi);        // restore dividend hi-word
    /* sbb ecx, edx    */   newInst(Mnemonic_SBB, 1, ecx, ecx, edx);    // subtract divisor * quot. from dividend
    /* sbb edx, edx    */   newInst(Mnemonic_SBB, 1, edx, edx, edx);    // 0 if remainder > 0, else FFFFFFFFh
    /* and esi, edx    */   newInst(Mnemonic_AND, 1, esi, esi, edx);    // nothing to add
    /* and edi, edx    */   newInst(Mnemonic_AND, 1, edi, edi, edx);    // back if remainder positive
    /* add ebx, esi    */   newInst(Mnemonic_ADD, 1, ebx, ebx, esi);    // correct remainder
    /* adc ecx, edi    */   newInst(Mnemonic_ADC, 1, ecx, ecx, edi);    // and quotient if
    /* add eax, edx    */   newInst(Mnemonic_ADD, 1, eax, eax, edx);    // necessary
    /* xor edx, edx    */   newInst(Mnemonic_MOV, edx, zero);           // clear hi-word of quot (eax<=FFFFFFFFh)
    
    connectNodeTo(storeResultNode);
    setCurrentNode(NULL);
    bigDivisorNode = (Node*)0xDEADBEEF;

    Node* testFixQuotSignNode = newBB();
    Node* fixQuotSignNode = newBB();
    Node* testFixRemSignNode = newBB();
    Node* fixRemSignNode = newBB();
    Node* finitaNode = newBB();

    //
    // storeResultNode:
    // 
    Opnd* quot_lo = irManager->newOpnd(int32type);
    Opnd* quot_hi = irManager->newOpnd(int32type);
    Opnd* rem_lo = irManager->newOpnd(int32type);
    Opnd* rem_hi = irManager->newOpnd(int32type);
    
    setCurrentNode(storeResultNode);
    /* mov rem_lo, ebx*/    newInst(Mnemonic_MOV, rem_lo, ebx);
    /* mov rem_hi, ecx*/    newInst(Mnemonic_MOV, rem_hi, ecx);
    /* mov quot_lo, eax*/   newInst(Mnemonic_MOV, quot_lo, eax);
    /* mov quot_hi, edx*/   newInst(Mnemonic_MOV, quot_hi, edx);
    connectNodeTo(testFixQuotSignNode);
    setCurrentNode(NULL);
    storeResultNode = (Node*)0xDEADBEEF;

    //
    // testFixQuotSignNode:
    //
    //
    setCurrentNode(testFixQuotSignNode);
    /* cmp fixQuotSign, 0  */   newInst(Mnemonic_CMP, fixQuotSign, zero);
    /* jz testFixRemSignNode */ newBranch(Mnemonic_JZ, testFixRemSignNode, fixQuotSignNode);
    setCurrentNode(NULL);
    testFixQuotSignNode = (Node*)0xDEADBEEF;

    //
    // fixQuotSignNode:
    //
    setCurrentNode(fixQuotSignNode);
    /* neg res_lo      */  newInst(Mnemonic_NEG, 1, quot_lo, quot_lo);
    /* adc res_hi, 0   */  newInst(Mnemonic_ADC, 1, quot_hi, quot_hi, zero);
    /* neg res_lo      */  newInst(Mnemonic_NEG, 1, quot_hi, quot_hi);
    connectNodeTo(testFixRemSignNode);
    setCurrentNode(NULL);
    fixQuotSignNode = (Node*)0xDEADBEEF;

    //
    // testFixRemSignNode:
    //
    //
    setCurrentNode(testFixRemSignNode);
    /* cmp fixRemSign, 0  */    newInst(Mnemonic_CMP, fixRemSign, zero);
    /* jz finitaNode */         newBranch(Mnemonic_JZ, finitaNode, fixRemSignNode);
    setCurrentNode(NULL);
    testFixQuotSignNode = (Node*)0xDEADBEEF;

    //
    // fixRemSignNode:
    //
    setCurrentNode(fixRemSignNode);
    /* neg res_lo      */  newInst(Mnemonic_NEG, 1, rem_lo, rem_lo);
    /* adc res_hi, 0   */  newInst(Mnemonic_ADC, 1, rem_hi, rem_hi, zero);
    /* neg res_lo      */  newInst(Mnemonic_NEG, 1, rem_hi, rem_hi);
    connectNodeTo(finitaNode);
    setCurrentNode(NULL);
    fixQuotSignNode = (Node*)0xDEADBEEF;

    //
    // finitaNode:
    //
    setCurrentNode(finitaNode);
    if (wantReminder) {
        /* mov res_lo, rem_lo*/  newInst(Mnemonic_MOV, res_lo, rem_lo);
        /* mov res_hi, rem_hi*/  newInst(Mnemonic_MOV, res_hi, rem_hi);
    }
    else {
        /* mov res_lo, quot_lo*/  newInst(Mnemonic_MOV, res_lo, quot_lo);
        /* mov res_hi, quot_hi*/  newInst(Mnemonic_MOV, res_hi, quot_hi);
    }
    connectNodes(finitaNode, getSubCfgReturnNode());
    setCurrentNode(NULL);
    finitaNode = (Node*)0xDEADBEEF;

    Node* originalInstNode = inst->getNode();
    propagateSubCFG(inst);
    propagateDivRemResults(inst, originalInstNode, quot_lo, quot_hi, rem_lo, rem_hi);
}

void I8Lowerer::propagateDivRemResults(
    Inst* origInst, Node* originalInstNode,
    Opnd* quot_lo, Opnd* quot_hi,
    Opnd* rem_lo, Opnd* rem_hi)
{

    bool propagate_div_rem = false; //FIXME: somehow crashes on Linux/IA-32 in dominators - need to fix.
    getArg("propagate_div_rem", propagate_div_rem);
    if (!propagate_div_rem) {
        return;
    }

    assert(origInst->getOpndRoles(0) & Inst::OpndRole_Def);
    assert(origInst->getOpndRoles(1) & Inst::OpndRole_Use);
    assert(origInst->getOpndRoles(2) & Inst::OpndRole_Use);
    Opnd* src1 = origInst->getOpnd(1);
    Opnd* src2 = origInst->getOpnd(2);
    if (!isSingleDef(src1) || !isSingleDef(src2)) {
        // Not a single def operands - can't propagate with current 
        // trivial implementation, need more sophisticated routine.
        return;
    }

    ControlFlowGraph* cfg = irManager->getFlowGraph();
    DominatorTree* dt = cfg->getDominatorTree();

    INST_ARRAY& i8insts = *m_pI8Insts;
    for (INST_ARRAY::iterator i=i8insts.begin(); i != i8insts.end(); i++) {
        Inst* inst = *i;
        if (inst == origInst || inst == NULL) {
            continue;
        }
        if (!inst->hasKind(Inst::Kind_I8PseudoInst) || 
            inst->getMnemonic() != Mnemonic_IDIV) {
            continue;
        }
        assert(inst->getOpndRoles(0) & Inst::OpndRole_Def);
        assert(inst->getOpndRoles(1) & Inst::OpndRole_Use);
        assert(inst->getOpndRoles(2) & Inst::OpndRole_Use);
        Opnd* otherSrc1 = origInst->getOpnd(1);
        Opnd* otherSrc2 = origInst->getOpnd(2);
        if (otherSrc1 != src1 || otherSrc2 != src2) {
            continue;
        }
        //
        // Test whether both instructions belong to the same loop
        //
        Node* origNodeLoopHdr = (*m_pLoopInfos)[origInst];
        Node* instLoopHdr = (*m_pLoopInfos)[inst];
        if (origNodeLoopHdr != instLoopHdr)  {
            continue;
        }
        //
        // Test whether one instruction dominates another
        // 
        Node* instNode = inst->getNode();
        if (!dt->dominates(originalInstNode, instNode)) {
            continue;
        }
        Log::out() << "Propagating: I" << origInst->getId() << " => I" << inst->getId() << std::endl;
        Opnd* otherDst = inst->getOpnd(0);
        Opnd* otherDst_hi, *otherDst_lo;
        prepareNewOpnds(otherDst, otherDst_lo, otherDst_hi);
        const bool isRem = isI8RemInst(inst);
        Inst* ii;
        ii = irManager->newCopyPseudoInst(Mnemonic_MOV, otherDst_lo, isRem ? rem_lo : quot_lo);
        ii->insertBefore(inst);
        ii = irManager->newCopyPseudoInst(Mnemonic_MOV, otherDst_hi, isRem ? rem_hi : quot_hi);
        ii->insertBefore(inst);
        inst->unlink();
        *i = NULL;
    }
}

//IR verification routine.
//checks that there are no I8Pseudo insts left in CFG after the pass
static void checkIR(IRManager* irm) {
#ifdef _DEBUG
    const Nodes& nodes = irm->getFlowGraph()->getNodes();
    for (Nodes::const_iterator it = nodes.begin(), end = nodes.end(); it!=end; ++it) {
       Node* node = *it;
        for (Inst* inst = (Inst*)node->getFirstInst(); inst!=NULL; inst = inst->getNextInst()) {
            assert(!inst->hasKind(Inst::Kind_I8PseudoInst));
        }
    }
#endif
}

}}






